{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import multiprocessing as mp\n",
    "import time\n",
    "import json\n",
    "\n",
    "# My implementation of a Hidden Markov Model\n",
    "from HiddenMarkovModel import HiddenMarkovModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# joint = pd.read_hdf(\"C:/Users/tabzr/Documents/CMU Dataset/r4.1/features.h5\", \"table\")\n",
    "joint = pd.read_hdf(\"/home/tabz/Documents/CMU_Dataset/r4.2/r42_features_complex.h5\", \"table\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 1000 users\n",
      "Using 16 features\n"
     ]
    }
   ],
   "source": [
    "users = np.unique(joint.index.values)\n",
    "print(\"There are\", users.size, \"users\")\n",
    "num_features = np.unique(joint[\"feature\"].values).size\n",
    "print(\"Using\", num_features, \"features\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# The initial configurations for the hidden markov model\n",
    "num_states = 10\n",
    "num_symbols = num_features\n",
    "deviation_from_uniform = 1/2\n",
    "# Seed the rng for reproducibility\n",
    "rng_seed = 42\n",
    "np.random.seed(rng_seed)\n",
    "\n",
    "def init_matrices():\n",
    "    # Start probabilities\n",
    "    startprobs = np.zeros(num_states)\n",
    "    startprobs[0] = 1\n",
    "    startprobs.fill(1/num_states)\n",
    "    startprobs += np.random.rand(num_states) * deviation_from_uniform\n",
    "    startprobs /= np.sum(startprobs)\n",
    "\n",
    "    # Transition probabilities\n",
    "    transprobs = np.empty((num_states,num_states))\n",
    "    transprobs.fill(1/num_states)\n",
    "    transprobs += np.random.rand(num_states,num_states) * deviation_from_uniform\n",
    "    transprobs /= transprobs.sum(axis=1)[:,np.newaxis]\n",
    "\n",
    "    # Emission probabilities\n",
    "    emissionprobs = np.empty((num_states,num_symbols))\n",
    "    emissionprobs.fill(1/num_symbols)\n",
    "    emissionprobs += np.random.rand(num_states,num_symbols) * deviation_from_uniform\n",
    "    emissionprobs /= emissionprobs.sum(axis=1)[:,np.newaxis]\n",
    "    \n",
    "    return startprobs, transprobs, emissionprobs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def compute_probs(user_df):\n",
    "    dayGrouping = pd.Grouper(key=\"date\", freq=\"1D\")\n",
    "    weekGrouping = pd.Grouper(key=\"date\", freq=\"1W\")\n",
    "    timeGrouping = user_df.groupby(weekGrouping)\n",
    "\n",
    "#     print(\"Starting on user: \", user)\n",
    "    s,t,e = init_matrices()\n",
    "    model = HiddenMarkovModel.HMM(num_states, t,e,s)\n",
    "\n",
    "    trainingPeriod = 4\n",
    "    timesTrained = 0\n",
    "\n",
    "    logProbScores = []\n",
    "\n",
    "    for name, group in timeGrouping:\n",
    "\n",
    "        #The sequence for the time grouping we are considering\n",
    "        seq = group[\"feature\"].values\n",
    "        \n",
    "        if len(seq) < 1:\n",
    "            # If there is no activity for this week \n",
    "            logProbScores.append(0)\n",
    "            continue\n",
    "\n",
    "        if timesTrained > trainingPeriod:\n",
    "            logProb = model.sequence_log_probability(seq)\n",
    "            logProbScores.append(-logProb)\n",
    "\n",
    "            #Train the model on the sequence we have just seen\n",
    "        model.learn(seq, max_iters=20, threshold=0.01, restart_threshold=0.1,max_restarts=5, inertia=0.5)\n",
    "        timesTrained+=1\n",
    "\n",
    "    return logProbScores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Queueing up jobs\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1000/1000 [22:35<00:00,  1.29s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress on those jobs:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 1000/1000 [1:29:20<00:00,  4.24s/it]\n"
     ]
    }
   ],
   "source": [
    "userScores = {}\n",
    "\n",
    "def setInDict(u):\n",
    "    def z(r):\n",
    "        userScores[u] = r\n",
    "    return z\n",
    "\n",
    "pool = mp.Pool(processes=8)\n",
    "print(\"Queueing up jobs\", flush=True)\n",
    "for user in tqdm(users):\n",
    "    pool.apply_async(compute_probs, args=(joint.loc[ joint.index == user ],), callback=setInDict(user))\n",
    "#     compute_probs(joint.loc[user])\n",
    "print(\"Progress on those jobs:\", flush=True)    \n",
    "done = 0\n",
    "for i in tqdm(users):\n",
    "    while done >= len(userScores):\n",
    "        time.sleep(3)\n",
    "    done += 1\n",
    "pool.close()\n",
    "pool.join()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Dump the userScores because it takes >1hr to calculate\n",
    "json.dump(userScores, open(\"userScores_complex_r42_50_inertia.json\", \"w\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
